Regression Discontinuity Design

Causality at the Cutoff

Bogdan G. Popescu

John Cabot University

Table of Contents

Arbitrary Cutoffs and Causal Inference · RDD Assumptions

Intro

How can we identify causal effects using observational data?

Some of the ways to conduct causal analysis with observational data include:

  • differences in differences
  • regression discontinuity design

RDD

RDD are about arbitrary rules and thresholds determining access to policy programs

  • subjects are in the program if they have scores above a specific threshold
  • subjects are not in the program if they have scores below a specific threshold

Or the other way around

Running or forcing variable - index that measures eligibility for the program (e.g. scores)
Cutoff / cutpoint / threshold - the number that determines access to the program

RDD

RDD

Student ID Exam Score Scholarship Awarded? First-Year GPA
101 83 No 3.2
102 84 No 3.3
103 85 Yes 3.6
104 86 Yes 3.5
105 87 Yes 3.7
106 84.5 No 3.4
107 85.1 Yes 3.6

RDD

Hypothetical tutoring program

Students take an entrance exam

Those who score 70 or lower get a free tutor for the year

Students then take an exit exam at the end of the year

RDD

Hypothetical tutoring program

Show the code
# Simulate a fake tutoring program: 1000 students with entrance/exit exam
# scores drawn from beta distributions, plus sharp and fuzzy treatment flags.
set.seed(1234)  # reproducibility
num_students <- 1000
tutoring <- tibble(
  id = 1:num_students,
  entrance_exam = rbeta(num_students, shape1 = 7, shape2 = 2),
  exit_exam = rbeta(num_students, shape1 = 5, shape2 = 3)
) |> 
  # Rescale entrance score to 0-100; scoring <= 70 assigns a tutor (sharp rule)
  mutate(entrance_exam = entrance_exam * 100,
         tutoring = entrance_exam <= 70) |> 
  # Outcome: noise (0-40) + a TRUE causal effect of 10 points for tutored
  # students + a slope of 0.5 in the running variable
  mutate(exit_exam = exit_exam * 40 + 10 * tutoring + entrance_exam / 2) |> 
  # Fuzzy version: in the 60-80 window, treatment is assigned at random
  # (imperfect compliance around the cutoff)
  mutate(tutoring_fuzzy = ifelse(entrance_exam > 60 & entrance_exam < 80,
                                 sample(c(TRUE, FALSE), n(), replace = TRUE),
                                 tutoring)) |> 
  # Readable factor labels for plot legends
  mutate(tutoring_text = factor(tutoring, levels = c(FALSE, TRUE),
                                labels = c("No tutor", "Tutor")),
         tutoring_fuzzy_text = factor(tutoring_fuzzy, levels = c(FALSE, TRUE),
                                      labels = c("No tutor", "Tutor"))) |> 
  # Center the running variable at the cutoff (0 = score of 70)
  mutate(entrance_centered = entrance_exam - 70)
Show the code
# Jittered strip plot: treatment status (y) vs. entrance score (x),
# with the cutoff at 70 marked by a vertical yellow line.
ggplot(tutoring, aes(x = entrance_exam, y = tutoring_text, fill = tutoring_text)) +
  geom_vline(xintercept = 70, linewidth = 2, color = "#FFDC00") + 
  geom_point(size = 3, pch = 21, color = "white", alpha = 0.4,
             # vertical jitter only, seeded so the layout is reproducible
             position = position_jitter(width = 0, height = 0.15, seed = 1234)) + 
  labs(x = "Entrance exam score", y = NULL) + 
  guides(fill = "none") +
  scale_fill_manual(values = c("black", "red"), name = NULL) +
  theme_bw(base_size = 28)

Intuition

Hypothetical tutoring program

The people right before and right after the threshold are very similar

The idea is that we can measure the effect of the program by examining students just around the threshold

Show the code
# Same strip plot, with shaded rectangles showing 5-point and 2-point
# windows around the cutoff (darker = narrower window).
ggplot(tutoring, aes(x = entrance_exam, y = tutoring_text, fill = tutoring_text)) +
  geom_vline(xintercept = 70, linewidth = 2, color = "#FFDC00") + 
    # +/- 5 point window (light grey)
    annotate(geom = "rect", fill = "grey50", alpha = 0.25, ymin = -Inf, ymax = Inf,
           xmin = 70 - 5,  xmax = 70 + 5) +
    # +/- 2 point window (darker grey)
    annotate(geom = "rect", fill = "grey50", alpha = 0.5, ymin = -Inf, ymax = Inf,
           xmin = 70 - 2,  xmax = 70 + 2) +
  geom_point(size = 3, pch = 21, color = "white", alpha = 0.4,
             position = position_jitter(width = 0, height = 0.15, seed = 1234)) + 
  labs(x = "Entrance exam score", y = NULL) + 
  guides(fill = "none") +
  scale_fill_manual(values = c("black", "red"), name = NULL) +
  theme_bw(base_size = 28)

Intuition

Hypothetical tutoring program

The people right before and right after the threshold are very similar

The idea is that we can measure the effect of the program by examining students just around the threshold.

Show the code
# Same windowed strip plot, zoomed to the +/- 5 band around the cutoff.
ggplot(tutoring, aes(x = entrance_exam, y = tutoring_text, fill = tutoring_text)) +
  geom_vline(xintercept = 70, linewidth = 2, color = "#FFDC00") + 
    annotate(geom = "rect", fill = "grey50", alpha = 0.25, ymin = -Inf, ymax = Inf,
           xmin = 70 - 5,  xmax = 70 + 5) +
    annotate(geom = "rect", fill = "grey50", alpha = 0.5, ymin = -Inf, ymax = Inf,
           xmin = 70 - 2,  xmax = 70 + 2) +
  geom_point(size = 3, pch = 21, color = "white", alpha = 0.4,
             position = position_jitter(width = 0, height = 0.15, seed = 1234)) + 
  labs(x = "Entrance exam score", y = NULL) + 
  guides(fill = "none") +
  scale_fill_manual(values = c("black", "red"), name = NULL) +
  theme_bw(base_size = 28)+
    # coord_cartesian zooms without dropping out-of-window points
    coord_cartesian(xlim = c(70 - 5, 70 + 5))

Intuition

Hypothetical tutoring program

Show the code
# Scatter of exit vs. entrance scores, colored by treatment, cutoff at 70.
ggplot(tutoring, aes(x = entrance_exam, y = exit_exam, fill = tutoring_text)) +
  geom_point(size = 3, pch = 21, color = "white", alpha = 0.4) +
  # `linewidth` replaces the deprecated `size` for line geoms (ggplot2 >= 3.4)
  geom_vline(xintercept = 70, linewidth = 2, color = "#FFDC00") +
  labs(x = "Entrance exam score", y = "Exit exam score") +
  guides(fill = guide_legend(reverse = TRUE)) +
  scale_fill_manual(values = c("black", "red"), name = NULL) +
  theme_bw(base_size = 28) +
  # coord_cartesian (not coord_sf, which is for sf spatial layers) zooms
  # without dropping data outside the window
  coord_cartesian(xlim = c(30, 100), ylim = c(40, 90))

Intuition

Hypothetical tutoring program

Show the code
# Scatter plot with separate linear fits on each side of the cutoff.
# NOTE(review): `size` on line geoms is deprecated in ggplot2 >= 3.4
# (use `linewidth`), and coord_sf is meant for sf spatial data — confirm
# whether coord_cartesian was intended.
ggplot(tutoring, aes(x = entrance_exam, y = exit_exam, fill = tutoring_text)) +
  geom_point(size = 3, pch = 21, color = "white", alpha = 0.4)+
  geom_vline(xintercept = 70, size = 2, color = "#FFDC00") + 
  labs(x = "Entrance exam score", y = "Exit exam score") + 
  #guides(fill = "none") +
    guides(fill = guide_legend(reverse = TRUE)) +
  # Below-cutoff (treated) fit in red
  geom_smooth(data = filter(tutoring, entrance_exam <= 70),
              method = "lm",
              color="red",
              #aes(color = tutoring_text),
              size = 2, show.legend = FALSE) +
  # Above-cutoff (untreated) fit in black
  geom_smooth(data = filter(tutoring, entrance_exam > 70),
              method = "lm", 
              color="black",
              #aes(color = tutoring_text),
              size = 2, show.legend = FALSE) +
    scale_fill_manual(values = c("black", "red"), name = NULL) +
  scale_color_manual(values = c("black", "red")) +
  theme_bw(base_size = 28)+
    coord_sf(xlim = c(30, 100), 
           ylim = c(40, 90))

Intuition

Hypothetical tutoring program

Show the code
library("broom")
# Put the treated group ("TRUE") as the factor reference so the intercept is
# the treated-side value at the cutoff and `tutoringFALSE` is the jump down
# to the untreated side.
tutoring <- tutoring |>
  mutate(tutoring = relevel(factor(tutoring), ref = "TRUE"))

# Parametric RDD: outcome on centered running variable + treatment dummy
rdd_tutoring <- lm(exit_exam ~ entrance_centered + tutoring, data = tutoring) |> 
  tidy()

# Fitted value at the cutoff on each side; `late` is the discontinuity
effect_control <- filter(rdd_tutoring, term == "(Intercept)")$estimate
late <- filter(rdd_tutoring, term == "tutoringFALSE")$estimate
effect_treatment <- effect_control + late


ggplot(tutoring, aes(x = entrance_exam, y = exit_exam, fill = tutoring_text)) +
  geom_point(size = 3, pch = 21, color = "white", alpha = 0.4) +
  geom_vline(xintercept = 70, linewidth = 2, color = "#FFDC00") +
  labs(x = "Entrance exam score", y = "Exit exam score") +
  guides(fill = guide_legend(reverse = TRUE)) +
  # Separate linear fits on each side of the cutoff
  geom_smooth(data = filter(tutoring, entrance_exam <= 70),
              method = "lm", color = "red",
              linewidth = 2, show.legend = FALSE) +
  geom_smooth(data = filter(tutoring, entrance_exam > 70),
              method = "lm", color = "black",
              linewidth = 2, show.legend = FALSE) +
  scale_fill_manual(values = c("black", "red"), name = NULL) +
  scale_color_manual(values = c("black", "red")) +
  theme_bw(base_size = 28) +
  # Red segment marks the discontinuity (the causal effect) at the cutoff
  annotate(geom = "segment", x = 70, xend = 70,
           y = effect_control, yend = effect_treatment,
           linewidth = 4, color = "red") +
  # FIX: the original label string was malformed (`label = \n(causal\n effect)",`
  # — no opening quote), which is a syntax error
  annotate(geom = "label", x = 75, y = effect_treatment - (late / 2),
           label = "δ\n(causal\n effect)",
           size = 5,
           color = "red", fill = alpha(c("white"), 0.8)) +
  coord_cartesian(xlim = c(30, 100), ylim = c(40, 90))

Intuition

Hypothetical tutoring program

The \(\delta\) measures the gap in outcome for people on both sides of the cutpoint

\(\delta\) is also called LATE (local average treatment effect)

The size of the gap depends on how you draw the lines on each side of the cutoff

The type of lines you choose can change the estimate of \(\delta\)

  • parametric vs. non-parametric
  • bandwidths
  • kernels

Intuition

Hypothetical tutoring program

Show the code
# Discontinuity plot; reuses `effect_control`, `effect_treatment`, and `late`
# computed in the earlier parametric-model chunk.
ggplot(tutoring, aes(x = entrance_exam, y = exit_exam, fill = tutoring_text)) +
  geom_point(size = 3, pch = 21, color = "white", alpha = 0.4) +
  geom_vline(xintercept = 70, linewidth = 2, color = "#FFDC00") +
  labs(x = "Entrance exam score", y = "Exit exam score") +
  guides(fill = guide_legend(reverse = TRUE)) +
  geom_smooth(data = filter(tutoring, entrance_exam <= 70),
              method = "lm", color = "red",
              linewidth = 2, show.legend = FALSE) +
  geom_smooth(data = filter(tutoring, entrance_exam > 70),
              method = "lm", color = "black",
              linewidth = 2, show.legend = FALSE) +
  scale_fill_manual(values = c("black", "red"), name = NULL) +
  scale_color_manual(values = c("black", "red")) +
  theme_bw(base_size = 28) +
  annotate(geom = "segment", x = 70, xend = 70,
           y = effect_control, yend = effect_treatment,
           linewidth = 4, color = "red") +
  # FIX: the original label string was malformed (missing opening quote),
  # which is a syntax error
  annotate(geom = "label", x = 75, y = effect_treatment - (late / 2),
           label = "δ\n(causal\n effect)",
           size = 5,
           color = "red", fill = alpha(c("white"), 0.8)) +
  coord_cartesian(xlim = c(30, 100), ylim = c(40, 90))

Intuition

Hypothetical tutoring program

Show the code
# Fit the parametric RDD model on the centered running variable and
# annotate the plot with the estimated gap (delta).
program_data <- tutoring %>%
  mutate(entrance_centered = 
           entrance_exam - 70)
model1 <- lm(exit_exam ~ 
               entrance_centered + tutoring,
             data = program_data)

# Extract the jump (tutoringFALSE, since "TRUE" is the reference level)
# and the intercept (treated-side value at the cutoff)
x<-tidy(model1)
late<-round(x$estimate[x$term=="tutoringFALSE"],2)
const<-round(x$estimate[x$term=="(Intercept)"],2)


ggplot(tutoring, aes(x = entrance_exam, y = exit_exam, fill = tutoring_text)) +
  geom_point(size = 3, pch = 21, color = "white", alpha = 0.4) +
  geom_vline(xintercept = 70, size = 2, color = "#FFDC00") + 
  labs(x = "Entrance exam score", y = "Exit exam score") + 
  #guides(fill = "none") +
    guides(fill = guide_legend(reverse = TRUE)) +
  geom_smooth(data = filter(tutoring, entrance_exam <= 70),
              method = "lm",
              color="red",
              #aes(color = tutoring_text),
              size = 2, show.legend = FALSE) +
  geom_smooth(data = filter(tutoring, entrance_exam > 70),
              method = "lm", 
              color="black",
              #aes(color = tutoring_text),
              size = 2, show.legend = FALSE) +
  scale_fill_manual(values = c("black", "red"), name = NULL) +
  scale_color_manual(values = c("black", "red")) +
  theme_bw(base_size = 28)+
  # NOTE(review): `effect_control`/`effect_treatment` come from an EARLIER
  # chunk's model — confirm they match model1's estimates
  annotate(geom = "segment", x = 70, xend = 70, 
           y = effect_control, yend = effect_treatment,
           size = 4, color = "red") +
  annotate(geom = "label", x = 75, y = const + (late / 2),
           label = paste("δ=", late),
           size=5,
           color = "red", fill = alpha(c("white"),0.8))+
    coord_sf(xlim = c(30, 100), 
           ylim = c(40, 90))

Intuition

Hypothetical tutoring program

Show the code
# 1. Fit cubic models separately on each side of the cutoff
fit_control <- lm(exit_exam ~ poly(entrance_exam, 3, raw = TRUE),
                  data = filter(tutoring, entrance_exam < 70))

fit_treat <- lm(exit_exam ~ poly(entrance_exam, 3, raw = TRUE),
                data = filter(tutoring, entrance_exam >= 70))

# 2. Get tidy versions
tidy_control <- tidy(fit_control)
tidy_treat <- tidy(fit_treat)

# Predicted outcome at the cutoff from each side; delta = size of the jump
cutoff <- 70
effect_control <- predict(fit_control, newdata = data.frame(entrance_exam = cutoff))
effect_treatment <- predict(fit_treat, newdata = data.frame(entrance_exam = cutoff))
delta <- round(effect_treatment - effect_control, 2)


# Prediction grid for drawing the piecewise cubic curve
x_vals <- tibble(entrance_exam = seq(30, 100, length.out = 300))

preds <- x_vals %>%
  mutate(
    exit_exam = ifelse(
      entrance_exam < 70,
      predict(fit_control, newdata = x_vals),
      predict(fit_treat, newdata = x_vals)
    )
  )

ggplot(tutoring, aes(x = entrance_exam, y = exit_exam, fill = tutoring_text)) +
  geom_point(size = 3, pch = 21, color = "white", alpha = 0.4) +
  geom_line(data = preds, aes(x = entrance_exam, y = exit_exam),
            inherit.aes = FALSE, color = "black", linewidth = 1.5) +
  geom_vline(xintercept = 70, color = "#FFDC00", linewidth = 2) +
  theme_bw(base_size = 28) +
  scale_fill_manual(values = c("Tutor" = "red", "No tutor" = "black"), name = NULL,
                    guide = guide_legend(reverse = TRUE)) +
  labs(x = "Entrance exam score", y = "Exit exam score") +
  # FIX: position the label midway across THIS chunk's gap; the original
  # reused `const` and `late` left over from a previous model chunk
  annotate(geom = "label", x = 75,
           y = (effect_control + effect_treatment) / 2,
           label = paste("δ=", delta),
           size = 5,
           color = "red", fill = alpha(c("white"), 0.8)) +
  coord_cartesian(xlim = c(30, 100), ylim = c(40, 90)) +
  guides(fill = guide_legend(reverse = TRUE))

Intuition

Hypothetical tutoring program

Show the code
library(dplyr)

# Add sine-transformed variable
# NOTE(review): `sin_term` is not used below — the nls formulas call sin()
# directly; column kept for backward compatibility
tutoring <- tutoring |>
  mutate(sin_term = sin(0.2 * entrance_exam))

# Linear + sinusoidal model fit on each side of the cutoff (illustrates how
# an exotic functional form changes the estimated gap)
fit_trig_control <- nls(
  exit_exam ~ a + b * entrance_exam + c * sin(d * entrance_exam),
  data = filter(tutoring, entrance_exam < 70),
  start = list(a = 55, b = 0.4, c = 15, d = 0.4)
)

fit_trig_treat <- nls(
  exit_exam ~ a + b * entrance_exam + c * sin(d * entrance_exam),
  data = filter(tutoring, entrance_exam >= 70),
  start = list(a = 55, b = 0.4, c = 15, d = 0.4)
)

# Prediction grid for drawing the piecewise fitted curve
x_vals <- tibble(entrance_exam = seq(30, 100, length.out = 300))

preds_trig <- x_vals |>
  mutate(
    exit_exam = ifelse(
      entrance_exam < 70,
      predict(fit_trig_control, newdata = x_vals),
      predict(fit_trig_treat, newdata = x_vals)
    )
  )

cutoff <- 70

# Predicted outcome from each model at the cutoff
cutoff_df <- data.frame(entrance_exam = cutoff)
effect_control <- predict(fit_trig_control, newdata = cutoff_df)
effect_treatment <- predict(fit_trig_treat, newdata = cutoff_df)

# Compute the treatment effect (jump at cutoff)
delta <- round(effect_treatment - effect_control, 2)

ggplot(tutoring, aes(x = entrance_exam, y = exit_exam, fill = tutoring_text)) +
  geom_point(size = 3, pch = 21, color = "white", alpha = 0.4) +
  geom_line(data = preds_trig, aes(x = entrance_exam, y = exit_exam),
            inherit.aes = FALSE, color = "black", linewidth = 1.5) +
  geom_vline(xintercept = 70, color = "#FFDC00", linewidth = 2) +
  theme_bw(base_size = 28) +
  scale_fill_manual(values = c("Tutor" = "red", "No tutor" = "black"), name = NULL,
                    guide = guide_legend(reverse = TRUE)) +
  labs(x = "Entrance exam score", y = "Exit exam score") +
  # FIX: center the label on THIS chunk's gap; the original reused `const`
  # and `late` from an earlier model chunk
  annotate(geom = "label", x = 75,
           y = (effect_control + effect_treatment) / 2,
           label = paste("δ=", delta),
           size = 5,
           color = "red", fill = alpha(c("white"), 0.8)) +
  coord_cartesian(xlim = c(30, 100), ylim = c(40, 90)) +
  guides(fill = guide_legend(reverse = TRUE))

Lines and Fit

Parametric line should fit the data pretty well

Non-parametric lines use the data to find the best line, often with windows and moving averages.

For example, one common method is LOESS (Locally Estimated Scatterplot Smoothing)

Measuring the Gap Parametrically

The gap can be easily measured by centering the running variable around the threshold

Show the code
# Preview the first five rows of the simulated data
utils::head(tutoring, n = 5)

\[ y = \beta_0 + \beta_1 \text{Running Variable (centered)} + \beta_2 \text{Treatment} \]

Measuring the Gap Parametrically

And this is how we can display the results

Show the code
library(modelsummary)
# Re-center the running variable so 0 = the cutoff (score of 70)
program_data <- tutoring |> 
  mutate(entrance_centered = 
           entrance_exam - 70)

#Running model
model1 <- lm(exit_exam ~  entrance_centered + tutoring, data = program_data)

#Make table (report only N and R-squared as goodness-of-fit rows)
modelsummary(model1, stars = TRUE, gof_map = c("nobs", "r.squared"))
(1)
+ p < 0.1, * p < 0.05, ** p < 0.01, *** p < 0.001
(Intercept) 70.306***
(0.501)
entrance_centered 0.514***
(0.027)
tutoringFALSE -10.968***
(0.802)
Num.Obs. 1000
R2 0.273

Measuring the Gap Parametrically

And this is how we can display the results on the graph

Show the code
# Pull the gap (tutoringFALSE coefficient) and the intercept (treated-side
# value at the cutoff) from model1
x <- tidy(model1)
gap <- round(x$estimate[x$term == "tutoringFALSE"], 2)
const <- round(x$estimate[x$term == "(Intercept)"], 2)

ggplot(tutoring, aes(x = entrance_exam, y = exit_exam, fill = tutoring_text)) +
  geom_point(size = 3, pch = 21, color = "white", alpha = 0.4) +
  geom_vline(xintercept = 70, linewidth = 2, color = "#FFDC00") +
  labs(x = "Entrance exam score", y = "Exit exam score") +
  guides(fill = guide_legend(reverse = TRUE)) +
  geom_smooth(data = filter(tutoring, entrance_exam <= 70),
              method = "lm", color = "red",
              linewidth = 2, show.legend = FALSE) +
  geom_smooth(data = filter(tutoring, entrance_exam > 70),
              method = "lm", color = "black",
              linewidth = 2, show.legend = FALSE) +
  scale_fill_manual(values = c("black", "red"), name = NULL) +
  scale_color_manual(values = c("black", "red")) +
  theme_bw(base_size = 28) +
  # FIX: draw the gap from THIS chunk's estimates (`const`, `gap`); the
  # original reused `effect_control`/`effect_treatment`/`late` from earlier
  # chunks, so the segment and the printed delta could disagree
  annotate(geom = "segment", x = 70, xend = 70,
           y = const, yend = const + gap,
           linewidth = 4, color = "red") +
  annotate(geom = "label", x = 75, y = const + (gap / 2),
           label = paste("δ=", gap),
           size = 5,
           color = "red", fill = alpha(c("white"), 0.8)) +
  coord_cartesian(xlim = c(30, 100), ylim = c(40, 90))

Measuring the Gap Nonparametrically

We need to calculate the gap nonparametrically using rdrobust

library(rdrobust)
# Local-polynomial RD estimate with cutoff at 70 (data-driven bandwidth
# and triangular kernel by default)
rd_out <- rdrobust(y = tutoring$exit_exam,
                   x = tutoring$entrance_exam,
                   c = 70)

#summary(rd_out)
#Extracting conventional coefficient (the non-bias-corrected point estimate)
gap_rdrobust<-rd_out$coef["Conventional", "Coeff"]

Measuring the Gap Nonparametrically

And this is how we can display the results on the graph

Show the code
# Linear fits on each side, used only to anchor the drawn gap at the cutoff
model_control <- lm(exit_exam ~ entrance_exam,
                    data = filter(tutoring, entrance_exam <= 70))

model_treat <- lm(exit_exam ~ entrance_exam,
                  data = filter(tutoring, entrance_exam > 70))

effect_control <- predict(model_control, newdata = data.frame(entrance_exam = 70))
effect_treatment <- predict(model_treat, newdata = data.frame(entrance_exam = 70))

# Nonparametric point estimate from the earlier rdrobust chunk
late <- round(gap_rdrobust, 2)

ggplot(tutoring, aes(x = entrance_exam, y = exit_exam, fill = tutoring_text)) +
  geom_point(size = 3, pch = 21, color = "white", alpha = 0.4) +
  geom_vline(xintercept = 70, linewidth = 2, color = "#FFDC00") +
  labs(x = "Entrance exam score", y = "Exit exam score") +
  guides(fill = guide_legend(reverse = TRUE)) +
  # LOESS (nonparametric) smooths on each side of the cutoff
  geom_smooth(data = filter(tutoring, entrance_exam <= 70),
              method = "loess", color = "red",
              linewidth = 2, show.legend = FALSE) +
  geom_smooth(data = filter(tutoring, entrance_exam > 70),
              method = "loess", color = "black",
              linewidth = 2, show.legend = FALSE) +
  scale_fill_manual(values = c("black", "red"), name = NULL) +
  scale_color_manual(values = c("black", "red")) +
  theme_bw(base_size = 28) +
  annotate(geom = "segment", x = 70, xend = 70,
           y = effect_control, yend = effect_treatment,
           linewidth = 4, color = "red") +
  # FIX: position the label midway across the drawn gap; the original used
  # `const`, a value left over from a previous model chunk
  annotate(geom = "label", x = 75,
           y = (effect_control + effect_treatment) / 2,
           label = paste("δ=", late),
           size = 5,
           color = "red", fill = alpha(c("white"), 0.8)) +
  coord_cartesian(xlim = c(30, 100), ylim = c(40, 90))

Bandwidths

What we care about is the area right around the cutoff.

The observations far away do not matter because they are too different and, thus, they are not comparable

That is why we need to focus on bandwidth or the window around cutoff

Bandwidths

These are the observations within a 5-point bandwidth

Show the code
#Marking obs. within a 5-point bandwidth (and a 10-point one for later use)
tutoring <- tutoring %>%
  mutate(
    in_bw_5 = abs(entrance_exam - 70) <= 5,
    in_bw_10 = abs(entrance_exam - 70) <= 10
  )


#Running model on the 5-point window only
program_data_bw_5<-subset(tutoring, in_bw_5==TRUE)
model2 <- lm(exit_exam ~  entrance_centered + tutoring, data = program_data_bw_5)

# Gap (tutoringFALSE) and intercept from the bandwidth-restricted model
x<-tidy(model2)
gap<-round(x$estimate[x$term=="tutoringFALSE"],2)
const<-round(x$estimate[x$term=="(Intercept)"],2)


# Faded full sample in the background, in-bandwidth points highlighted,
# side fits and gap drawn only from the in-bandwidth data
ggplot(tutoring, aes(x = entrance_exam, y = exit_exam, fill = tutoring_text)) +
  geom_point(size = 3, pch = 21, color = "white", alpha = 0.2) +
  geom_vline(xintercept = 70, size = 2, color = "#FFDC00") + 
  labs(x = "Entrance exam score", y = "Exit exam score") + 
  #guides(fill = "none") +
    guides(fill = guide_legend(reverse = TRUE)) +
  theme_bw(base_size = 28)+
    geom_point(data = filter(tutoring, in_bw_5),
           aes(x = entrance_exam, y = exit_exam),
           size = 3, pch = 21, color = "white", alpha = 1)+
geom_smooth(data = filter(tutoring, in_bw_5 & entrance_exam <= 70),
            method = "lm",
            color = "red",
            size = 2,
            show.legend = FALSE) +
geom_smooth(data = filter(tutoring, in_bw_5 & entrance_exam > 70),
            method = "lm",
            color = "black",
            size = 2,
            show.legend = FALSE)+
  scale_fill_manual(values = c("black", "red"), name = NULL) +
  scale_color_manual(values = c("black", "red")) +
  annotate(geom = "segment", x = 70, xend = 70, 
           y = const, yend = const + gap,
           size = 4, color = "red") +
    annotate(geom = "label", x = 75, y = const + (gap / 2),
           label = paste("δ=", gap),
           size=5,
           color = "red", fill = alpha(c("white"),0.8))+
    coord_sf(xlim = c(30, 100), 
           ylim = c(40, 90))

Bandwidths

These are the observations within a 10-point bandwidth

Show the code
#Running model on the 10-point window (in_bw_10 was defined in the
# 5-point-bandwidth chunk)
program_data_bw_10<-subset(tutoring, in_bw_10==TRUE)
model3 <- lm(exit_exam ~  entrance_centered + tutoring, data = program_data_bw_10)

# Gap (tutoringFALSE) and intercept from the bandwidth-restricted model
x<-tidy(model3)
gap<-round(x$estimate[x$term=="tutoringFALSE"],2)
const<-round(x$estimate[x$term=="(Intercept)"],2)


# Faded full sample in the background, in-bandwidth points highlighted
ggplot(tutoring, aes(x = entrance_exam, y = exit_exam, fill = tutoring_text)) +
  geom_point(size = 3, pch = 21, color = "white", alpha = 0.2) +
  geom_vline(xintercept = 70, size = 2, color = "#FFDC00") + 
  labs(x = "Entrance exam score", y = "Exit exam score") + 
  #guides(fill = "none") +
    guides(fill = guide_legend(reverse = TRUE)) +
  scale_fill_manual(values = c("black", "red"), name = NULL) +
  scale_color_manual(values = c("black", "red")) +
  theme_bw(base_size = 28)+
    geom_point(data = filter(tutoring, in_bw_10),
           aes(x = entrance_exam, y = exit_exam),
           size = 3, pch = 21, color = "white", alpha = 1)+
  geom_smooth(data = filter(tutoring, in_bw_10 & entrance_exam <= 70),
            method = "lm",
            color = "red",
            size = 2,
            show.legend = FALSE) +
geom_smooth(data = filter(tutoring, in_bw_10 & entrance_exam > 70),
            method = "lm",
            color = "black",
            size = 2,
            show.legend = FALSE)+
  annotate(geom = "segment", x = 70, xend = 70, 
           y = const, yend = const + gap,
           size = 4, color = "red") +
    annotate(geom = "label", x = 75, y = const + (gap / 2),
           label = paste("δ=", gap),
           size=5,
           color = "red", fill = alpha(c("white"),0.8))+
    coord_sf(xlim = c(30, 100), 
           ylim = c(40, 90))

Bandwidths

Optimal Bandwidth

But which ones do we choose?

The rdrobust package allows us to decide on the bandwidth:

Show the code
# Let rdrobust pick the MSE-optimal bandwidth around the cutoff
rd_out <- rdrobust(y = tutoring$exit_exam,
                   x = tutoring$entrance_exam,
                   c = 70)

bw_opt <- rd_out$bws[1]  # optimal bandwidth for conventional estimate
bw_opt
[1] 7.615756

Bandwidths

Optimal Bandwidth

These are the results using the 7.62 bandwidth.

Show the code
#Running model restricted to the rdrobust-chosen optimal bandwidth
program_data_opt<-subset(tutoring, abs(entrance_centered)<bw_opt)
model4 <- lm(exit_exam ~  entrance_centered + tutoring, data = program_data_opt)

# Gap (tutoringFALSE) and intercept from the bandwidth-restricted model
x<-tidy(model4)
gap<-round(x$estimate[x$term=="tutoringFALSE"],2)
const<-round(x$estimate[x$term=="(Intercept)"],2)


# Faded full sample in the background, in-bandwidth points highlighted
ggplot(tutoring, aes(x = entrance_exam, y = exit_exam, fill = tutoring_text)) +
  geom_point(size = 3, pch = 21, color = "white", alpha = 0.2) +
  geom_vline(xintercept = 70, size = 2, color = "#FFDC00") + 
  labs(x = "Entrance exam score", y = "Exit exam score") + 
  #guides(fill = "none") +
    guides(fill = guide_legend(reverse = TRUE)) +
  scale_fill_manual(values = c("black", "red"), name = NULL) +
  scale_color_manual(values = c("black", "red")) +
  theme_bw(base_size = 28)+
    geom_point(data = program_data_opt,
           aes(x = entrance_exam, y = exit_exam),
           size = 3, pch = 21, color = "white", alpha = 1)+
    geom_smooth(data = filter(program_data_opt, entrance_exam <= 70),
              method = "lm",
              color = "red",
              size = 2,
              show.legend = FALSE) +
  geom_smooth(data = filter(program_data_opt, entrance_exam > 70),
              method = "lm",
              color = "black",
              size = 2,
              show.legend = FALSE) +
  annotate(geom = "segment", x = 70, xend = 70, 
           y = const, yend = const + gap,
           size = 4, color = "red") +
    annotate(geom = "label", x = 75, y = const + (gap / 2),
           label = paste("δ=", gap),
           size=5,
           color = "red", fill = alpha(c("white"),0.8))+
    coord_sf(xlim = c(30, 100), 
           ylim = c(40, 90))

Bandwidths

It’s a good idea to have multiple bandwidths

  • some that make sense
  • some that are decided by the optimal bandwidth algorithm

Bandwidths

Add either table or graph

Kernel

The other important part of RD is choosing the kernel

Kernel = method for assigning importance to observations based on distance to the cutoff

Bandwidths

Kernel: Uniform

Show the code
library(ggnewscale)
# Define the boxcar (uniform) kernel function: weight 0.5 inside |u| <= 1,
# 0 outside
rec <- function(x) (abs(x) <= 1) * 0.5

# Set cutoff and bandwidth
cutoff <- 70
bandwidth <- 40

# For rectangular kernel overlay
kernel_df <- data.frame(
  entrance_exam = seq(cutoff - bandwidth - 10, cutoff + bandwidth + 10, length.out = 200)
)
kernel_df$u <- (kernel_df$entrance_exam - cutoff) / bandwidth
kernel_df$boxcar_weight <- rec(kernel_df$u)
# NOTE(review): boxcar_weight_rescaled is computed but the overlay below is
# drawn with geom_segment instead — confirm whether this column is still needed
kernel_df$boxcar_weight_rescaled <- kernel_df$boxcar_weight * 50 + 40  # visual rescaling

# Apply kernel weights to main data (u = standardized distance to cutoff)
tutoring$boxcar_weight <- with(tutoring, {
  u <- (entrance_exam - cutoff) / bandwidth
  rec(u)
})

# Run weighted regression (zero-weight observations contribute nothing)
model4 <- lm(exit_exam ~ entrance_centered + tutoring, 
             data = tutoring, 
             weights = boxcar_weight)

# Gap (tutoringFALSE) and intercept for the annotation
x <- tidy(model4)
gap <- round(x$estimate[x$term == "tutoringFALSE"], 2)
const <- round(x$estimate[x$term == "(Intercept)"], 2)

# Plot with rectangular kernel overlay
ggplot() + 
  geom_point(data = tutoring, 
             aes(x = entrance_exam, 
                 y = exit_exam, 
                 fill = tutoring_text,
                 size = boxcar_weight),
             shape = 21, 
             alpha = 0.2) +
  geom_vline(xintercept = 70, size = 2, color = "#FFDC00") + 
  labs(x = "Entrance exam score", y = "Exit exam score") + 
  geom_smooth(
    data = filter(tutoring, entrance_exam <= 70),
    aes(x = entrance_exam, y = exit_exam),
    method = "lm",
    color = alpha("red", 0.2),
    fill = alpha("red", 0.2),
    size = 2,
    show.legend = FALSE
  ) +
  geom_smooth(
    data = filter(tutoring, entrance_exam > 70),
    aes(x = entrance_exam, y = exit_exam),
    method = "lm",
    color = alpha("black", 0.2),
    fill = alpha("black", 0.2),
    alpha = 0.2,
    size = 2,
    show.legend = FALSE
  ) +
  scale_fill_manual(values = c("black", "red"), name = NULL) +
  scale_color_manual(values = c("black", "red")) +
  scale_size_continuous(name = "Uniform\nWeight", range = c(2, 6)) +
  theme_bw(base_size = 28) +
  coord_cartesian(xlim = c(20, 120), ylim = c(40, 90)) +
  # new_scale_color() lets the kernel overlay use its own color scale
  new_scale_color() +
  
  # Rectangle-shaped kernel overlay using geom_segment()
  geom_segment(
    aes(x = cutoff - bandwidth, xend = cutoff + bandwidth,
        y = 0.5 * 50 + 40, yend = 0.5 * 50 + 40,
        color = "Boxcar\nKernel"),
    size = 2
  ) +
  geom_segment(aes(x = cutoff - bandwidth, xend = cutoff - bandwidth,
                 y = 40, yend = 0.5 * 50 + 40),
             color = "blue", size = 1) +
  geom_segment(aes(x = cutoff + bandwidth, xend = cutoff + bandwidth,
                 y = 40, yend = 0.5 * 50 + 40),
             color = "blue", size = 1)+
  
  scale_color_manual(name = NULL, values = c("Boxcar\nKernel" = "blue"))+
    annotate(geom = "segment", x = 70, xend = 70, 
           y = const, yend = const + gap,
           size = 4, color = "red") +
  annotate(geom = "label", x = 75, y = const + (gap / 2),
           label = paste("δ=", gap),
           size = 5,
           color = "red", fill = alpha("white", 0.8))+
    guides(
  size = guide_legend(order = 1),
  fill = guide_legend(order = 2),
  color = guide_legend(order = 3)
)

Bandwidths

Kernel: Triangular

Show the code
# Set cutoff and bandwidth
cutoff <- 70
bandwidth <- 40  # adjust as needed

# Create a data frame to draw the triangular kernel
kernel_df <- data.frame(
  entrance_exam = seq(cutoff - bandwidth, cutoff + bandwidth, length.out = 200)
)

# Triangular kernel weights for visualization
kernel_df$tri_weight <- with(kernel_df, {
  u <- (entrance_exam - cutoff) / bandwidth
  ifelse(abs(u) <= 1, 1 - abs(u), 0)
})

# Rescale for visual overlay
kernel_df$tri_weight_rescaled <- kernel_df$tri_weight * 50 + 40  # adjust scaling as needed

# Compute triangular weights in main dataset
tutoring$tri_weight <- with(tutoring, {
  u <- (entrance_exam - cutoff) / bandwidth
  ifelse(abs(u) <= 1, 1 - abs(u), 0)
})

# Run weighted local linear regression using the triangular kernel:
# observations closer to the cutoff receive more weight.
model4 <- lm(exit_exam ~ entrance_centered + tutoring, 
             data = tutoring, 
             weights = tri_weight)

# Extract estimates for annotation.
# BUG FIX: `tutoring` is a logical variable, so lm() names its dummy
# coefficient "tutoringTRUE" (there is no "tutoringFALSE" term).
# Matching the wrong name returned numeric(0), which broke the
# delta-segment and label annotations in the plot below.
x <- tidy(model4)
gap <- round(x$estimate[x$term == "tutoringTRUE"], 2)
const <- round(x$estimate[x$term == "(Intercept)"], 2)


# RD scatterplot with triangular-kernel weights: point size encodes each
# observation's weight, linear fits are drawn separately on each side of
# the cutoff, and the kernel's shape is overlaid as a blue line.
ggplot()+ 
  geom_point(data = tutoring, 
             aes(x = entrance_exam, 
                 y = exit_exam, 
                 fill = tutoring_text,
                 size = tri_weight),
             shape = 21, 
             alpha = 0.2) +
  # Vertical line marking the 70-point cutoff
  geom_vline(xintercept = 70, size = 2, color = "#FFDC00") + 
  labs(x = "Entrance exam score", y = "Exit exam score") + 
# Linear fit for scores at or below the cutoff (tutored side)
geom_smooth(
  data = filter(tutoring, entrance_exam <= 70),
  aes(x = entrance_exam, y = exit_exam),
  method = "lm",
  color = alpha("red", 0.2),        # line color
  fill = alpha("red", 0.2),          # confidence interval fill
  size = 2,
  show.legend = FALSE
) +
# Linear fit for scores above the cutoff (untutored side)
geom_smooth(
  data = filter(tutoring, entrance_exam > 70),
  aes(x = entrance_exam, y = exit_exam),
  method = "lm",
  color = alpha("black", 0.2),        # line color
  fill = alpha("black", 0.2),          # confidence interval fill
  alpha = 0.2,
  size = 2,
  show.legend = FALSE
) +
  scale_fill_manual(values = c("black", "red"), name = NULL) +
  scale_color_manual(values = c("black", "red")) +
  scale_size_continuous(name = "Triangular\nWeight", range = c(2, 6)) +  # legend control
  theme_bw(base_size = 28)+
  # Red segment showing the estimated treatment-effect gap at the cutoff;
  # `const` and `gap` come from model4 fitted in the previous chunk
  annotate(geom = "segment", x = 70, xend = 70, 
           y = const, yend = const + gap,
           size = 4, color = "red") +
    # Label reporting the size of the discontinuity (delta)
    annotate(geom = "label", x = 75, y = const + (gap / 2),
           label = paste("δ=", gap),
           size=5,
           color = "red", fill = alpha(c("white"),0.8))+
  coord_cartesian(xlim = c(20, 120), ylim = c(40, 90)) +
  # Start a second color scale (ggnewscale) so the kernel line can have
  # its own legend entry without clashing with the fits above
  new_scale_color() +
  geom_line(
    data = kernel_df,
    aes(x = entrance_exam, 
        y = tri_weight_rescaled,
        color = "Triangular\n Kernel"), # mapped so the kernel line gets a legend entry
    inherit.aes = FALSE,
    size = 2
  )+
  scale_color_manual(name = NULL, values = c("Triangular\n Kernel" = "blue"))+
    # Fix the ordering of the three legends
    guides(
  size = guide_legend(order = 1),
  fill = guide_legend(order = 2),
  color = guide_legend(order = 3)
)

Bandwidths

Kernel: Epanechnikov

Show the code
# Grid of entrance-exam scores used only to draw the kernel's shape
kernel_df <- data.frame(
  entrance_exam = seq(cutoff - bandwidth, cutoff + bandwidth, length.out = 200)
)

# Standardized distance from the cutoff, in bandwidth units
u <- (kernel_df$entrance_exam - cutoff) / bandwidth
# Epanechnikov kernel: parabolic weight 0.75 * (1 - u^2) inside the
# window, zero outside
kernel_df$epa_weight <- ifelse(abs(u) > 1, 0, 0.75 * (1 - u^2))
# Rescale the kernel so it overlays the scatterplot's y-axis range
kernel_df$epa_weight_rescaled <- 40 + kernel_df$epa_weight * 50  # adjust as needed


# Epanechnikov weight for each student, used in the weighted regression
tutoring$epa_weight <- with(tutoring, {
  u <- (entrance_exam - cutoff) / bandwidth
  ifelse(abs(u) > 1, 0, 0.75 * (1 - u^2))
})

# Run weighted local linear regression using the Epanechnikov kernel
model4 <- lm(exit_exam ~ entrance_centered + tutoring, 
             data = tutoring, 
             weights = epa_weight)

# Extract estimates for annotation.
# BUG FIX: `tutoring` is a logical variable, so lm() names its dummy
# coefficient "tutoringTRUE"; matching "tutoringFALSE" returned
# numeric(0) and broke the delta annotations in the plot below.
x <- tidy(model4)
gap <- round(x$estimate[x$term == "tutoringTRUE"], 2)
const <- round(x$estimate[x$term == "(Intercept)"], 2)


# RD scatterplot with Epanechnikov-kernel weights: point size encodes
# each observation's weight, linear fits are drawn separately on each
# side of the cutoff, and the kernel's shape is overlaid as a blue line.
ggplot()+ 
  geom_point(data = tutoring, 
             aes(x = entrance_exam, 
                 y = exit_exam, 
                 fill = tutoring_text,
                 size = epa_weight),
             shape = 21, 
             alpha = 0.2) +
  # Vertical line marking the 70-point cutoff
  geom_vline(xintercept = 70, size = 2, color = "#FFDC00") + 
  labs(x = "Entrance exam score", y = "Exit exam score") + 
# Linear fit for scores at or below the cutoff (tutored side)
geom_smooth(
  data = filter(tutoring, entrance_exam <= 70),
  aes(x = entrance_exam, y = exit_exam),
  method = "lm",
  color = alpha("red", 0.2),        # line color
  fill = alpha("red", 0.2),          # confidence interval fill
  size = 2,
  show.legend = FALSE
) +
# Linear fit for scores above the cutoff (untutored side)
geom_smooth(
  data = filter(tutoring, entrance_exam > 70),
  aes(x = entrance_exam, y = exit_exam),
  method = "lm",
  color = alpha("black", 0.2),        # line color
  fill = alpha("black", 0.2),          # confidence interval fill
  alpha = 0.2,
  size = 2,
  show.legend = FALSE
) +
  scale_fill_manual(values = c("black", "red"), name = NULL) +
  scale_color_manual(values = c("black", "red")) +
  scale_size_continuous(name = "Epanechnikov\nWeight", range = c(2, 6)) +  # legend control
  theme_bw(base_size = 28)+
  # Red segment showing the estimated treatment-effect gap at the cutoff;
  # `const` and `gap` come from model4 fitted in the previous chunk
  annotate(geom = "segment", x = 70, xend = 70, 
           y = const, yend = const + gap,
           size = 4, color = "red") +
    # Label reporting the size of the discontinuity (delta)
    annotate(geom = "label", x = 75, y = const + (gap / 2),
           label = paste("δ=", gap),
           size=5,
           color = "red", fill = alpha(c("white"),0.8))+
  coord_cartesian(xlim = c(20, 120), ylim = c(40, 90)) +
  # Start a second color scale (ggnewscale) so the kernel line can have
  # its own legend entry without clashing with the fits above
  new_scale_color() +
  geom_line(
    data = kernel_df,
    aes(x = entrance_exam, 
        y = epa_weight_rescaled,
        color = "Epanechnikov\n Kernel"), # mapped so the kernel line gets a legend entry
    inherit.aes = FALSE,
    size = 2
  )+
  scale_color_manual(name = NULL, values = c("Epanechnikov\n Kernel" = "blue"))+
  # Fix the ordering of the three legends
  guides(
  size = guide_legend(order = 1),
  fill = guide_legend(order = 2),
  color = guide_legend(order = 3)
)

Limitations and Caveats

The gap estimated at the cutoff (using observations within the chosen bandwidth) is the LATE (Local Average Treatment Effect).

Thus, the findings have high internal validity and low external validity:

  • the findings are limited in scope
  • these are local empirical results

Caveat 1

Manipulation at the threshold

People might be aware of the cutoff, so they might modify their behavior to get into or out of the program

For example, students could try to get a slightly lower grade to benefit from the tutoring program.

This means that the people just around the cutoff are no longer comparable, so they are not representative of the control and treatment groups.

Caveat 1

Manipulation Test

The way to test for manipulation at the cutoff is the McCrary density test.

This is an example of Non-manipulation

Show the code
tutoring <- tibble(
  id = 1:num_students,
  entrance_exam = rbeta(num_students, shape1 = 7, shape2 = 2),
  exit_exam = rbeta(num_students, shape1 = 5, shape2 = 3)
) |> 
  mutate(entrance_exam = entrance_exam * 100,
         tutoring = entrance_exam <= 70) |> 
  mutate(exit_exam = exit_exam * 40 + 10 * tutoring + entrance_exam / 2) |> 
  mutate(tutoring_fuzzy = ifelse(entrance_exam > 60 & entrance_exam < 80,
                                 sample(c(TRUE, FALSE), n(), replace = TRUE),
                                 tutoring)) |> 
  mutate(tutoring_text = factor(tutoring, levels = c(FALSE, TRUE),
                                labels = c("No tutor", "Tutor")),
         tutoring_fuzzy_text = factor(tutoring_fuzzy, levels = c(FALSE, TRUE),
                                      labels = c("No tutor", "Tutor"))) |> 
  mutate(entrance_centered = entrance_exam - 70)


library(rddensity)
mtest <- rddensity(X = tutoring$entrance_exam, c=70)
no_man <- rdplotdensity(mtest, X = tutoring$entrance_exam)$Estplot +
  geom_vline(xintercept=70, linetype="dashed") +
  labs(x="CD4 Count", y="Density")

Caveat 1

Manipulation Test

The way to test for manipulation at the cutoff is the McCrary density test.

This is an example of Manipulation

Show the code
set.seed(123)  # for reproducibility

tutoring2 <- tibble(
  id = 1:num_students,
  entrance_exam = rbeta(num_students, shape1 = 7, shape2 = 2),
  exit_exam = rbeta(num_students, shape1 = 5, shape2 = 3)
) |> 
  mutate(entrance_exam = entrance_exam * 100) |> 

  # 🔥 Force 30% of observations between 70–75 to move just below 70
  mutate(entrance_exam = ifelse(entrance_exam > 70 & entrance_exam < 75 & runif(n()) < 0.3,
                                runif(n(), 65, 69.9),  # move just below
                                entrance_exam)) |> 
  
  # ❌ Thin out data just above 70–75 range to enhance visual gap
  filter(!(entrance_exam > 70 & entrance_exam < 75 & runif(n()) < 0.3)) |> 
  
  mutate(
    tutoring = entrance_exam <= 70,
    exit_exam = exit_exam * 40 + 10 * tutoring + entrance_exam / 2,
    tutoring_fuzzy = ifelse(entrance_exam > 60 & entrance_exam < 80,
                            sample(c(TRUE, FALSE), n(), replace = TRUE),
                            tutoring),
    tutoring_text = factor(tutoring, levels = c(FALSE, TRUE),
                           labels = c("No tutor", "Tutor")),
    tutoring_fuzzy_text = factor(tutoring_fuzzy, levels = c(FALSE, TRUE),
                                 labels = c("No tutor", "Tutor")),
    entrance_centered = entrance_exam - 70
  )

library(rddensity)
mtest <- rddensity(X = tutoring2$entrance_exam, c=70)
man<-rdplotdensity(mtest,X = tutoring2$entrance_exam)$Estplot+
  geom_vline(xintercept=70, linetype="dashed") +
  labs(x="CD4 Count", y="Density")